In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.figure_factory as ff
import warnings
from scipy import stats

warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file shipped under the kernel's input directory.
input_paths = (
    os.path.join(dirpath, fname)
    for dirpath, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

Loading Dataset

In [2]:
# Load the raw workbook; the first data row holds the real column names.
df = pd.read_excel("Assignment.xlsx")
del df['Unnamed: 0']       # spurious index column written by the exporter
df.columns = df.iloc[0]    # promote the first data row to the header
df = df.drop(df.index[0])  # drop the promoted row from the data
# drop=True renumbers rows directly instead of creating an 'index'
# column that would then have to be deleted.
df = df.reset_index(drop=True)

Preprocessing, cleaning

In [3]:
# Parse timestamps and use them as the index while keeping the column
# around (drop=False) for later resets.
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.set_index('datetime', drop=False)
In [4]:
# How much of the data would be lost by simply dropping rows with nulls?
loss_pct = (1 - df.dropna().shape[0] / df.shape[0]) * 100
str(loss_pct) + "%"
Out[4]:
'13.979166666666664%'

Since dropping rows with null values would lose a significant share of the data (~14%), I'll instead fill each missing value with the mean of its column.

In [5]:
# Values were read under the promoted header, so columns start out as
# object dtype; convert them to proper numeric dtypes first.
df = df.infer_objects()
# Impute missing readings with each column's mean (dropping them would
# discard ~14% of the rows).  numeric_only=True keeps mean() well-defined
# on modern pandas, where averaging non-numeric columns raises.
df = df.fillna(df.mean(numeric_only=True))
del df['datetime']  # the timestamp now lives in the index; drop the duplicate column
In [6]:
df = df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

Plotting the range of temperatures

In [7]:
# Overlayed temperature distributions for the four refrigerators.
# `.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.to_numpy(dtype=float)` is the supported replacement.
temp_cols = ['t1', 't2', 't3', 't4']
hist_data = [df[col].to_numpy(dtype=float) for col in temp_cols]

group_labels = ['Refrigerator 1', 'Refrigerator 2', 'Refrigerator 3', 'Refrigerator 4']

# Create distplot with a custom bin size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)
fig.show()
In [8]:
px.box(df,y='t1',points="all")
In [9]:
px.box(df,y='t2',points="all")
In [10]:
px.box(df,y='t3',points="all")
In [11]:
px.box(df,y='t4',points="all")
In [17]:
# Recover 'datetime' as a regular column (it is currently the index)
# so plotly can use it as the x axis; later cells reuse `temp`.
temp = df.reset_index()
px.line(temp.head(100), x="datetime", y="p1")
In [22]:
px.line(temp.head(200), x='datetime', y='p2')
In [23]:
px.line(temp.head(200), x='datetime', y='p3')
In [24]:
px.line(temp.head(200), x='datetime', y='p4')
In [36]:
(df.t1[(df.t1 > 1) & (df.t1 < 4)].shape[0]/ df.t1.shape[0])*100
Out[36]:
56.08788301859063
In [37]:
(df.t2[(df.t2 > 1) & (df.t2 < 4)].shape[0]/ df.t2.shape[0])*100
Out[37]:
33.470497464912924
In [38]:
(df.t3[(df.t3 > 1) & (df.t3 < 4)].shape[0]/ df.t3.shape[0])*100
Out[38]:
74.06863105297965
In [39]:
(df.t4[(df.t4 > 1) & (df.t4 < 4)].shape[0]/ df.t4.shape[0])*100
Out[39]:
88.38268792710706
In [53]:
avg_daily = temp.resample('D', on='datetime')['p1','p2','p3','p4'].sum().sum()/10
In [84]:
temp1 = pd.DataFrame(avg_daily)
In [85]:
temp1.columns = ['KWd']
In [86]:
temp1 = temp1.reset_index()
In [87]:
temp1.columns = ['Units', 'Avg_Consumption']
In [88]:
px.bar(temp1, x = 'Units', y='Avg_Consumption').show()
In [89]:
temp1
Out[89]:
Units Avg_Consumption
0 p1 1540.322836
1 p2 2336.617804
2 p3 2228.050405
3 p4 1908.438070
In [ ]: